# Loading packages
library(pacman)
p_load(
  data.table, here, ggplot2, fixest, magrittr, 
  dplyr, ggExtra, haven, purrr
)

theme_set(hrbrthemes::theme_ipsum())

# Raw google and deep solar data
google_dt = fread(here("Data/google-project-sunroof/project-sunroof-census_tract.csv")) %>% 
  .[,region_name := as.character(region_name)]
deep_solar_dt = fread(here("Data/deep-solar/deepsolar_tract.csv")) %>% 
  .[,fips := as.character(fips)]

# Merged data from Mark
merged_dt = 
  cbind(
    fread(here("Data/CleanedData/MergedData.csv")), 
    read_dta(here("Data/CleanedData/BI_tCollPercPolnm0Base.dta")),
    read_dta(here("Data/CleanedData/HH_tract.dta"))
  )

br = 10
merged_dt[,':='( 
  # Calculating log installs per capita
  pred_linstall_pop = log(BI_t_pred/population),
  prob_b_sim = BI_t_pred/HH_tract,
  prob_b_data = BI_t/HH_tract,
  l_prob_b_sim = log(BI_t_pred/HH_tract),
  l_prob_b_data = log(BI_t/HH_tract),
  # Creating bins for boxplots
  bin_at = cut(A_t/1e3, breaks = br),
  bin_collperc = cut(CollPerc_t, breaks = br),
  bin_pol = cut(Pol_t, breaks = br),
  bin_elec_price = cut(avg_electricity_retail_rate, breaks = br),
  bin_ptat = cut(ptAt, breaks = br),
  bin_tot_ben = cut(tot_ben, breaks = br),
  # Bins to check against Mark's results 
  bin_CollPerc_t = cut(CollPerc_t, breaks = seq(-0.75, 0.75, by = 0.25)),
  bin_tot_ben_t = cut(CollPerc_t, breaks = 1804)
)]

# Checking against mark's results 
merged_dt[,
  lapply(.SD, mean, na.rm = TRUE),
  keyby = bin_CollPerc_t, 
  .SDcols = c("l_prob_b_sim","l_prob_b_data","CollPerc_t","HH_tract", "tot_ben")
]
merged_dt[is.finite(l_prob_b_data),
  lapply(.SD, mean, na.rm = TRUE),
  keyby = tot_ben > 1804, 
  .SDcols = c("l_prob_b_sim","l_prob_b_data","CollPerc_t","HH_tract", "tot_ben")
]

~!emerged_dt2 = fread(here("Data/CleanedData/MergedData2.csv")) 

merged_dt2[,':='(
 prob_b_sim = BI_t_pred/HH_tract,
 prob_b_data = BI_t/HH_tract,
 l_prob_b_sim = log(BI_t_pred/HH_tract),
 l_prob_b_data = log(BI_t/HH_tract)
)]

# Adding census regions
#merged_dt =   
#  merge(
#     merged_dt,
#     fread(here("Data/census-regions.csv")),
#     by.x = "state", 
#     by.y = "State Code",
#     all.x = T
#   )

# -------------------------------------------------------------------
# Table w/correlation of installs w/varaibles other descr stats of installs
# -------------------------------------------------------------------
install_density_p =
  ggplot(data = merged_dt, aes(x = prob_b_data, fill = Region)) + 
  geom_density(color = NA, alpha = 0.5) + 
  scale_fill_viridis_d() + 
  scale_x_log10(labels = scales::percent_format(accuracy = 0.1)) +
  labs(
    x = "Percent install per building (Log scale)",
    y = "Density"
  )
ggsave(
  plot = install_density_p, 
  filename = here("figures/installs/install_density_region.jpeg"),
  width = 8, height = 6
)

install_density_pred = 
melt(
  merged_dt[,.(
    tract,
    `Actual Installations` = prob_b_data, 
    `Predicted Installations` = prob_b_sim
  )],
  id.vars = c("tract")
) |>
#  filter(value < 100) |>
  ggplot(aes(x = value, fill = variable)) + 
  geom_density(color = NA, alpha = 0.5) +   
  scale_fill_viridis_d(
    name = "", 
    labels = c("Actual","Predicted"),
    option = "inferno",begin = 0.25, end = 0.75
  ) + 
  scale_x_log10(labels = scales::percent) +
  labs(
    x = "Percent install per building (Log Scale)",
    y = "Density"
  )
ggsave(
  plot = install_density_pred, 
  filename = here("figures/installs/install_density_pred.jpeg"),
  width = 8, height = 6
)


merged_dt[, .N, by= install_percDS_pop < 1]

merged_dt[!is.na(linstall_percDS_pop),c(.(
  mean = mean(linstall_percDS_pop, na.rm = T),
  median = median(linstall_percDS_pop, na.rm = T),
  max = max(install_percDS_pop, na.rm = T),
  sd = sd(linstall_percDS_pop, na.rm = T)),
  lapply(.SD, \(x){cor(linstall_percDS_pop, x)})
)
,.SDcols = c("A_t","CollPerc_t","Pol_t","avg_electricity_retail_rate","ptAt")
]


# -------------------------------------------------------------------
# Relationship of installs w/different variables 
# -------------------------------------------------------------------
p_load(locpol)
d <- data.frame(x = runif(250))
d$y <- d$x^2 - d$x + 1 + rnorm(250, sd = 0.1)
r <- locpol(y~x,d)
plot(r)


pred_vs_actual_b_plot = function(x_var, s = 75, deg = 0){
  
  # Preliminaries
  col_vec = c("tract",x_var,"HH_tract","prob_b_data","prob_b_sim")
  plot_dt = merged_dt2[,..col_vec]
  setnames(plot_dt, old = c(x_var), new = "x")
  x_lab = case_when(
    x_var == "A_t" ~ "Solar Irradiance",
    x_var == "CollPerc_t" ~ "College Percent",
    x_var == "Pol_t" ~ "Percent Democrat",
    x_var == "tot_ben" ~ "Total Monetary Benefit",
    x_var == "avg_electricity_retail_rate" ~ "Electricity Price",
    x_var == "ptAt" ~ "Electricity Price x Solar Irradiance",
    TRUE ~ x_var
  )
  
  sd_bw = sd(plot_dt$x)
  
  # Creating base plot 
  pred_p = 
    melt(
      plot_dt,
      id.vars = c("tract","x", "HH_tract")
    ) |>
    ggplot(aes(x = x, y = log(value), color = variable, weight = HH_tract)) + 
    geom_smooth(
      method = "loess", se = FALSE, span = s/100,
      method.args = list(degree = deg)
    ) +  
    geom_point(alpha = 0) + 
    #scale_y_log10(
    #  labels = scales::percent_format(accuracy = 0.1)
    #) +  
    scale_color_viridis_d(
      name = "", 
      labels = c("Actual","Predicted"),
      option = "inferno",begin = 0.25, end = 0.75
    ) +
    labs(
      x = x_lab,
      y = "Installations per House (Log scale)",
      caption = paste0("Span = ",s, ", deg = ", deg)
    ) +
    coord_cartesian(ylim = c(-7.5, -2))
  
  # Adding the margin to the   
  pred_density_p = 
    ggMarginal(
      pred_p, 
      type = "density", 
      margins = "x",
      fill = "gray"
    ) 
  
  #Saving the result
  ggsave(
    plot = pred_density_p,
    filename = here(paste0("figures/installs/",x_var,"/prob_b_s",s,".jpeg")),
    width = 8, height = 7
  )
  
  print(pred_density_p)
}

map2(
  rep("tot_ben", 9),
  seq(10, 90, by  =10),
  pred_vs_actual_b_plot
)

# Generating plots 
map(
  c("A_t","CollPerc_t","avg_electricity_retail_rate","ptAt"), #,"Pol_t","tot_ben"
  pred_vs_actual_b_plot
)



#--------------------------------------------------------------------
# Comparing actual vs predicted installations 
pred_vs_actual_install = 
  ggplot(data = merged_dt, aes(x = exp(linstall_percDS_pop), y = exp(pred_linstall_pop))) + 
  geom_smooth() +
  scale_x_log10(labels = scales::percent) +
  scale_y_log10(labels = scales::percent) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") + 
  labs(
    x = "Log installations per capita",
    y = "Predicted log installations per capita"
  )
ggsave(
  pred_vs_actual_install, 
  filename = here("figures/installs/pred_vs_actual_install.jpeg"),
  width = 8, height = 6
)

pred_vs_actual_install_b = 
  ggplot(data = merged_dt[prob_b_data>0], aes(x = prob_b_data, y = prob_b_sim)) + 
  #geom_point(alpha = 0.2) +
  geom_smooth() +
  scale_x_log10(labels = scales::percent) +
  scale_y_log10(labels = scales::percent) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") + 
  labs(
    x = "Log installations per building",
    y = "Predicted log installations per building"
  ) 
#ggsave(
  pred_vs_actual_install_b#,
#  filename = here("figures/installs/pred_vs_actual_install.jpeg"),
#  width = 8, height = 6
#)
  

# Solar Irradiance --------------------------------------------------
installs_vs_irradiance_p =   
  ggplot(data = merged_dt, aes(y = linstall_percDS_pop, x = A_t)) + 
#  geom_point(aes(y = pred_linstall_pop), alpha = 0.05, color = "red") + 
  geom_point(aes(color = Region), alpha = 0.05) +
  geom_smooth(method = "lm") +
  #geom_smooth(aes(color = Region),method = "lm") +
#  geom_smooth(aes(y = pred_linstall_pop), method = "lm", color = "red") +
  scale_color_viridis_d("Region") + 
  labs(
    y = "Log Installations per Capita",
    x = "Solar Irradiance"
  )
ggsave(
  plot = installs_vs_irradiance_p, 
  filename = here("figures/installs/installs_vs_irradiance.jpeg"),
  width = 8, height = 6
)

# Boxplot
melt(
  merged_dt[,.(tract, A_t, `Actual Installations` = linstall_percDS_pop, `Predicted Installations` = pred_linstall_pop)],
  id.vars = c("tract","A_t")
) |>
  ggplot(aes(x = A_t, y = exp(value))) + # , color = variable 
  geom_boxplot(aes(group = cut_width(A_t, 100)), outlier.alpha = 0.1, position = "dodge2") +
  scale_y_log10(labels = scales::percent_format(accuracy = 0.1)) +  
  labs(
    x = "Solar Irradiation",
    y = "Log Installations per Capita"
  ) + 
  facet_wrap(~variable)


# College Percent ---------------------------------------------------
installs_vs_collperc_p = 
  merged_dt |>
  ggplot(aes(y = linstall_percDS_pop, x = CollPerc_t)) + 
  geom_point(aes(color = region), alpha = 0.05) + 
  geom_smooth(method = "lm") +
  scale_color_viridis_d("Region") + 
  labs(
    y = "Log Installations per Capita",
    x = "College Percent"
  )
ggsave(
  plot = installs_vs_collperc_p, 
  filename = here("figures/installs/installs_vs_collperc.jpeg"),
  width = 8, height = 6
)


melt(
  merged_dt[,.(tract, bin_collperc, linstall_percDS_pop, pred_linstall_pop)],
  id.vars = c("tract","bin_collperc")
) |>
  ggplot(aes(x = bin_collperc, y = value, color = variable)) + 
  geom_boxplot(position = "dodge") +
  scale_color_viridis_d(
    name = "", 
    labels = c("Actual","Predicted"),
    option = "inferno",begin = 0.25, end = 0.75
  ) +
  labs(
    x = "College Percent",
    y = "Log Installations per Capita"
  )


# Percent democrat --------------------------------------------------
ggplot(data = merged_dt, aes(y = linstall_percDS_pop, x = Pol_t, color = Region)) + 
  geom_point(alpha = 0.05) + 
  geom_smooth(method = "lm")+
  scale_color_viridis_d() + 
  labs(
    y = "Log Installations per Capita",
    x = "Percent Voting Democrat"
  )
ggsave(
  plot = installs_vs_democrat_p, 
  filename = here("figures/installs/installs_vs_collperc.jpeg"),
  width = 8, height = 6
)


melt(
  merged_dt[,.(tract, bin_pol, linstall_percDS_pop, pred_linstall_pop)],
  id.vars = c("tract","bin_pol")
) |>
  ggplot(aes(x = bin_pol, y = value, color = variable)) + 
  geom_boxplot(position = "dodge") +
  scale_color_viridis_d(
    name = "", 
    labels = c("Actual","Predicted"),
    option = "inferno",begin = 0.25, end = 0.75
  ) +
  labs(
    x = "Percent Democrat",
    y = "Log Installations per Capita"
  )


# Price of electricity ----------------------------------------------
ggplot(
  data = merged_dt, 
  aes(
    y = linstall_percDS_pop, 
    x = avg_electricity_retail_rate, 
    color = Region
)) + 
  geom_point(alpha = 0.05) + 
  geom_smooth(method = "lm") +
  scale_color_viridis_d() + 
  labs(
    y = "Log Installations per Capita",
    x = "Price of Electricity"
  )
melt(
  merged_dt[,.(tract, bin_elec_price, linstall_percDS_pop, pred_linstall_pop)],
  id.vars = c("tract","bin_elec_price")
) |>
  ggplot(aes(x = bin_elec_price, y = value, color = variable)) + 
  geom_boxplot(position = "dodge") +
  scale_color_viridis_d(
    name = "", 
    labels = c("Actual","Predicted"),
    option = "inferno",begin = 0.25, end = 0.75
  ) +
  labs(
    x = "Price of Electricity",
    y = "Log Installations per Capita"
  )


# Irradiance x Electricity price ------------------------------------
ggplot(data = merged_dt, aes(y = linstall_percDS_pop, x = ptAt, color = Region)) + 
  geom_point(alpha = 0.05) + 
  geom_smooth(method = "lm") +
  scale_color_viridis_d() + 
  labs(
    y = "Log Installations per Capita",
    x = "Price of Electricity times Solar Irradiation"
  )

melt(
  merged_dt[,.(tract, bin_ptat, linstall_percDS_pop, pred_linstall_pop)],
  id.vars = c("tract","bin_ptat")
) |>
  ggplot(aes(x = bin_ptat, y = value, color = variable)) + 
  geom_boxplot(position = "dodge") +
  scale_color_viridis_d(
    name = "", 
    labels = c("Actual","Predicted"),
    option = "inferno",begin = 0.25, end = 0.75
  ) +
  labs(
    x = "Price of Electricity times solar irradiation",
    y = "Log Installations per Capita"
  )


# Total Monetary Benefit --------------------------------------------
ggplot(data = merged_dt, aes(y = linstall_percDS_pop, x = tot_ben, color = Region)) + 
  geom_point(alpha = 0.05) + 
  geom_smooth(method = "lm") +
  scale_color_viridis_d() + 
  labs(
    y = "Log Installations per Capita",
    x = "Total Monetary Benefit of Installation"
  )
melt(
  merged_dt[,.(tract, bin_tot_ben, linstall_percDS_pop, pred_linstall_pop)],
  id.vars = c("tract","bin_tot_ben")
) |>
  ggplot(aes(x = bin_tot_ben, y = value, color = variable)) + 
  geom_boxplot(position = "dodge") +
  scale_color_viridis_d(
    name = "", 
    labels = c("Actual","Predicted"),
    option = "inferno",begin = 0.25, end = 0.75
  ) +
  labs(
    x = "Total Monetary Benefit of Installation",
    y = "Log Installations per Capita"
  )



# Others ------------------------------------------------------------
ggplot(data = merged_dt, aes(y = linstall_percDS_pop, x = per_capita_income, color = Region)) + 
  geom_point(alpha = 0.05) + 
  #geom_smooth(method = "lm") +
  scale_color_viridis_d() + 
  labs(
    y = "Log Installations per Capita",
    x = "Income per Capita"
  )




